#pragma once
#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "tensor.h"
#include "vector_utils.h"
#include "norm_weight.h"

/**
 * Declaration of the host-callable RMSNorm launcher.
 *
 * Only the prototype lives in this header; the definition is provided
 * elsewhere, so callers depend on the instantiations made available by the
 * defining translation unit.
 *
 * NOTE(review): the parameter roles below are inferred from names alone,
 * because the implementation is not visible from this header. Confirm them
 * against the definition before relying on them.
 *
 * @tparam T                Element type shared by both tensor wrappers and
 *                          the norm weight.
 * @param  decoder_out      Pointer to a tensor wrapper; presumably the
 *                          decoder activations to be normalized (possibly in
 *                          place) -- TODO confirm.
 * @param  decoder_residual Pointer to a tensor wrapper; presumably the
 *                          residual buffer associated with decoder_out --
 *                          TODO confirm.
 * @param  norm_weight      Norm weight object, taken by non-const reference.
 * @param  eps              Float constant; presumably the epsilon used for
 *                          numerical stability in the normalization --
 *                          TODO confirm.
 * @param  is_last          Optional flag, false when omitted; presumably
 *                          marks the final normalization call -- TODO confirm.
 */
template <typename T>
void launchRMSNorm(
    TensorWrapper<T> *decoder_out,
    TensorWrapper<T> *decoder_residual,
    NormWeight<T> &norm_weight,
    float eps,
    bool is_last = false);